{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jsonlines\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jsonl(file_paths):\n",
    "    content_id = 0\n",
    "    preprocessed_data = []\n",
    "    questions = []\n",
    "    data = []\n",
    "    question_id = 0\n",
    "    for file_path in file_paths:\n",
    "        with open(file_path, \"r\") as f:\n",
    "            for item in jsonlines.Reader(f):\n",
    "                paper_questions = []\n",
    "                for section in item[\"sections\"]:\n",
    "                    start = section[0]\n",
    "                    end = section[1]\n",
    "                    content = \" \".join(item[\"words\"][start:end])\n",
    "                    preprocessed_data.append({\"id\": \"scirex-{:0>4d}\".format(content_id), \"content\": content})\n",
    "                    content_id += 1\n",
    "                for relation in item[\"n_ary_relations\"]:\n",
    "                    Material = relation[\"Material\"]\n",
    "                    Method = relation[\"Method\"]\n",
    "                    Metric = relation[\"Metric\"]\n",
    "                    Task = relation[\"Task\"]\n",
    "                    score = relation[\"score\"]\n",
    "                    data.append({\"Material\": Material, \"Method\": Method, \"Metric\": Metric, \"Task\": Task, \"score\": score})\n",
    "                    question = \"What is the corresponding {} score of the {} method on {} dataset for {} task?\".format(Metric, Method, Material, Task)\n",
    "                    answer = score\n",
    "                    paper_questions.append({\"qid\": \"\", \"question\": question, \"answer\": answer})\n",
    "                random_index = random.randint(0, len(paper_questions) - 1)\n",
    "                questions.append(paper_questions[random_index])\n",
    "    return questions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "file_paths = [\"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/scirex/train.jsonl\", \n",
    "              \"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/scirex/dev.jsonl\", \n",
    "              \"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/scirex/test.jsonl\"]\n",
    "questions = read_jsonl(file_paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open(\"/<YOUR_OWN_PATH>/ToolQA/data/questions/easy/scirex-easy.jsonl\", mode=\"w\") as writer:\n",
    "    question_indices = random.sample(range(len(questions)), 100)\n",
    "    question_id = 0\n",
    "    for id in question_indices:\n",
    "        row = questions[id]\n",
    "        row[\"qid\"] = \"easy-scirex-{:0>4d}\".format(question_id)\n",
    "        question_id += 1\n",
    "        writer.write(row)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
